In [185]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn import preprocessing
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import PCA
In [187]:
df = pd.read_csv("heart_cleveland_upload.csv")
In [189]:
df
Out[189]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal condition
0 69 1 0 160 234 1 2 131 0 0.1 1 1 0 0
1 69 0 0 140 239 0 0 151 0 1.8 0 2 0 0
2 66 0 0 150 226 0 0 114 0 2.6 2 0 0 0
3 65 1 0 138 282 1 2 174 0 1.4 1 1 0 1
4 64 1 0 110 211 0 2 144 1 1.8 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
292 40 1 3 152 223 0 0 181 0 0.0 0 0 2 1
293 39 1 3 118 219 0 0 140 0 1.2 1 0 2 1
294 35 1 3 120 198 0 0 130 1 1.6 1 0 2 1
295 35 0 3 138 183 0 0 182 0 1.4 0 0 0 0
296 35 1 3 126 282 0 2 156 1 0.0 0 0 2 1

297 rows × 14 columns

In [191]:
df.describe
Out[191]:
<bound method NDFrame.describe of      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     69    1   0       160   234    1        2      131      0      0.1   
1     69    0   0       140   239    0        0      151      0      1.8   
2     66    0   0       150   226    0        0      114      0      2.6   
3     65    1   0       138   282    1        2      174      0      1.4   
4     64    1   0       110   211    0        2      144      1      1.8   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
292   40    1   3       152   223    0        0      181      0      0.0   
293   39    1   3       118   219    0        0      140      0      1.2   
294   35    1   3       120   198    0        0      130      1      1.6   
295   35    0   3       138   183    0        0      182      0      1.4   
296   35    1   3       126   282    0        2      156      1      0.0   

     slope  ca  thal  condition  
0        1   1     0          0  
1        0   2     0          0  
2        2   0     0          0  
3        1   1     0          1  
4        1   0     0          0  
..     ...  ..   ...        ...  
292      0   0     2          1  
293      1   0     2          1  
294      1   0     2          1  
295      0   0     0          0  
296      0   0     2          1  

[297 rows x 14 columns]>

Data Visualization¶

In [194]:
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(9, 6))
sns.heatmap(corr, mask=mask)
plt.title('Correlation Heatmap', fontsize=15)
plt.show()
No description has been provided for this image

Pre-Processing¶

In [197]:
df.isna().sum()
Out[197]:
age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64
In [199]:
df = df.drop(columns = ['condition'])
##standardization
standardized_df = pd.DataFrame(preprocessing.StandardScaler().fit_transform(df),columns = df.columns)
standardized_df.head()
Out[199]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
0 1.600302 0.691095 -2.240629 1.596266 -0.257179 2.430427 1.010199 -0.812095 -0.696419 -0.820813 0.643781 0.344824 -0.874292
1 1.600302 -1.446980 -2.240629 0.468418 -0.160859 -0.411450 -1.003419 0.061157 -0.696419 0.639470 -0.976583 1.411625 -0.874292
2 1.268242 -1.446980 -2.240629 1.032342 -0.411292 -0.411450 -1.003419 -1.554358 -0.696419 1.326662 2.264145 -0.721976 -0.874292
3 1.157555 0.691095 -2.240629 0.355633 0.667499 2.430427 1.010199 1.065396 -0.696419 0.295874 0.643781 0.344824 -0.874292
4 1.046868 0.691095 -2.240629 -1.223355 -0.700254 -0.411450 1.010199 -0.244481 1.435916 0.639470 0.643781 -0.721976 -0.874292
In [201]:
corr = df.corr()
print(corr)
               age       sex        cp  trestbps      chol       fbs  \
age       1.000000 -0.092399  0.110471  0.290476  0.202644  0.132062   
sex      -0.092399  1.000000  0.008908 -0.066340 -0.198089  0.038850   
cp        0.110471  0.008908  1.000000 -0.036980  0.072088 -0.057663   
trestbps  0.290476 -0.066340 -0.036980  1.000000  0.131536  0.180860   
chol      0.202644 -0.198089  0.072088  0.131536  1.000000  0.012708   
fbs       0.132062  0.038850 -0.057663  0.180860  0.012708  1.000000   
restecg   0.149917  0.033897  0.063905  0.149242  0.165046  0.068831   
thalach  -0.394563 -0.060496 -0.339308 -0.049108 -0.000075 -0.007842   
exang     0.096489  0.143581  0.377525  0.066691  0.059339 -0.000893   
oldpeak   0.197123  0.106567  0.203244  0.191243  0.038596  0.008311   
slope     0.159405  0.033345  0.151079  0.121172 -0.009215  0.047819   
ca        0.362210  0.091925  0.235644  0.097954  0.115945  0.152086   
thal      0.120795  0.370556  0.266275  0.130612  0.023441  0.051038   

           restecg   thalach     exang   oldpeak     slope        ca      thal  
age       0.149917 -0.394563  0.096489  0.197123  0.159405  0.362210  0.120795  
sex       0.033897 -0.060496  0.143581  0.106567  0.033345  0.091925  0.370556  
cp        0.063905 -0.339308  0.377525  0.203244  0.151079  0.235644  0.266275  
trestbps  0.149242 -0.049108  0.066691  0.191243  0.121172  0.097954  0.130612  
chol      0.165046 -0.000075  0.059339  0.038596 -0.009215  0.115945  0.023441  
fbs       0.068831 -0.007842 -0.000893  0.008311  0.047819  0.152086  0.051038  
restecg   1.000000 -0.072290  0.081874  0.113726  0.135141  0.129021  0.013612  
thalach  -0.072290  1.000000 -0.384368 -0.347640 -0.389307 -0.268727 -0.258386  
exang     0.081874 -0.384368  1.000000  0.289310  0.250572  0.148232  0.323268  
oldpeak   0.113726 -0.347640  0.289310  1.000000  0.579037  0.294452  0.336809  
slope     0.135141 -0.389307  0.250572  0.579037  1.000000  0.109761  0.260096  
ca        0.129021 -0.268727  0.148232  0.294452  0.109761  1.000000  0.248825  
thal      0.013612 -0.258386  0.323268  0.336809  0.260096  0.248825  1.000000  
In [203]:
threshold = 0.5

high_corr = corr[(corr.abs() > threshold) & (corr != 1.0)]

plt.figure(figsize=(10, 8))
sns.heatmap(high_corr, annot=True, cmap="coolwarm", mask=high_corr.isnull())
sns.set_style('dark')
plt.title("Heatmap of High Correlations")
plt.show()
No description has been provided for this image
In [205]:
pca = PCA(n_components=0.95)
pca.fit(standardized_df)

plt.plot(pca.explained_variance_ratio_) 

plt.xlabel('Component Number')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance By Component')

plt.show()
No description has been provided for this image
In [207]:
pca = PCA(n_components = 3, random_state = 17) 
pca_df = pd.DataFrame(pca.fit_transform(standardized_df))
pca_df.shape
Out[207]:
(297, 3)

K-Means Clustering¶

In [210]:
inertias = []

for i in range(1,11):
    kmeans = KMeans(n_clusters=i, random_state=5)
    clusters = kmeans.fit_predict(pca_df)
    inertias.append(kmeans.inertia_)

plt.plot(range(1,11), inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
No description has been provided for this image
In [212]:
silhouette_scores = []

for i in range(2,11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    clusters = kmeans.fit_predict(pca_df)
    silhouette_scores.append(silhouette_score(pca_df, clusters))

plt.plot(range(2,11), silhouette_scores, marker='x')
plt.title('Silhouette Score')
plt.xlabel('Number of clusters')
plt.ylabel('Score')
plt.show()
No description has been provided for this image
In [214]:
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(pca_df)
standardized_df["Clusters"] = kmeans.predict(pca_df)
In [216]:
standardized_df['Clusters'].value_counts()
Out[216]:
Clusters
2    117
1     97
0     83
Name: count, dtype: int64
In [218]:
subset = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
fig, axs = plt.subplots(3, 3, figsize=(15, 15))

axs = axs.flatten()

i = 0

for column in subset:
    sns.boxplot(data=standardized_df, y=column, x=standardized_df["Clusters"], ax = axs[i])
    axs[i].set_title(f'Box Plot of {column}')
    axs[i].set_xlabel('Clusters')
    axs[i].set_ylabel(column)

    i = i + 1

plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[218], line 9
      6 i = 0
      8 for column in subset:
----> 9     sns.boxplot(data=standardized_df, y=column, x=standardized_df["Clusters"], ax = axs[i])
     10     axs[i].set_title(f'Box Plot of {column}')
     11     axs[i].set_xlabel('Clusters')

IndexError: index 9 is out of bounds for axis 0 with size 9
No description has been provided for this image
In [220]:
sns.pairplot(standardized_df, hue='Clusters', vars=subset)
plt.show()
No description has been provided for this image
In [221]:
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(pca_df)
standardized_df["Clusters"] = kmeans.predict(pca_df)
In [222]:
standardized_df['Clusters'].value_counts()
Out[222]:
Clusters
2    90
1    85
3    63
0    59
Name: count, dtype: int64
In [223]:
fig, axs = plt.subplots(3, 3, figsize=(15, 15))

axs = axs.flatten()

i = 0

for column in subset:
    sns.boxplot(data=standardized_df, y=column, x=standardized_df["Clusters"], ax = axs[i])
    axs[i].set_title(f'Box Plot of {column}')
    axs[i].set_xlabel('Clusters')
    axs[i].set_ylabel(column)

    i = i + 1

plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[223], line 8
      5 i = 0
      7 for column in subset:
----> 8     sns.boxplot(data=standardized_df, y=column, x=standardized_df["Clusters"], ax = axs[i])
      9     axs[i].set_title(f'Box Plot of {column}')
     10     axs[i].set_xlabel('Clusters')

IndexError: index 9 is out of bounds for axis 0 with size 9
No description has been provided for this image
In [ ]:
sns.pairplot(standardized_df, hue='Clusters', vars=subset)
plt.show()
In [226]:
plt.figure(figsize=(20, 7))
plt.title("Dendrogram")
dend = shc.dendrogram(shc.linkage(pca_df, method='average'),truncate_mode='level',p =10)
plt.show()
No description has been provided for this image
In [227]:
agglo = AgglomerativeClustering(n_clusters = 2, linkage = 'ward')
standardized_df['Clusters'] = agglo.fit_predict(pca_df)
In [230]:
standardized_df['Clusters'].value_counts()
Out[230]:
Clusters
0    216
1     81
Name: count, dtype: int64
In [232]:
sns.pairplot(standardized_df, hue='Clusters', vars=subset)
plt.show()
No description has been provided for this image
In [233]:
fig, axs = plt.subplots(3, 3, figsize=(15, 15))

axs = axs.flatten()

i = 0

for column in subset:
    sns.boxplot(data=standardized_df, y=column, x=standardized_df["Clusters"], ax = axs[i])
    axs[i].set_title(f'Box Plot of {column}')
    axs[i].set_xlabel('Clusters')
    axs[i].set_ylabel(column)

    i = i + 1

plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[233], line 8
      5 i = 0
      7 for column in subset:
----> 8     sns.boxplot(data=standardized_df, y=column, x=standardized_df["Clusters"], ax = axs[i])
      9     axs[i].set_title(f'Box Plot of {column}')
     10     axs[i].set_xlabel('Clusters')

IndexError: index 9 is out of bounds for axis 0 with size 9
No description has been provided for this image
In [ ]:
subset = ['thalach', 'age']
sns.pairplot(data=standardized_df, vars=subset, hue='Clusters', diag_kind='kde', corner=True)
plt.show()
In [ ]:
subset = ['thalach', 'chol']
sns.pairplot(data=standardized_df, vars=subset, hue='Clusters', diag_kind='kde', corner=True)
plt.show()
In [ ]: